import os
import pickle
import sys

import numpy as np
import pandas as pd

from Env_deterministic import FiniteStateFiniteActionMDP_deterministic
from Qadv_swiching import Qlearning_gen_adv
# Integer job index taken from the first command-line argument. It selects
# the output file name and how far the global NumPy RNG is advanced below.
n = int(sys.argv[1])

# Tens part and last digit of the job index (neither is read again in the
# portion of the script visible here).
task, idd = divmod(n, 10)

# Settings handed to Qlearning_gen_adv further down.
# NOTE(review): the meaning of c1/c2/c3/using_adv_min is defined by
# Qlearning_gen_adv, not by this script -- confirm against that module.
total_episodes = 3000000
c1 = np.sqrt(2)
c2 = 2
c3 = 1
using_adv_min = 200

# Seed the global NumPy RNG *before* the environment is built, so any draws
# made from it during construction happen at the same RNG state for every n.
np.random.seed(1)

H = 10
S = 5
A = 5
mdp_env = FiniteStateFiniteActionMDP_deterministic(H=H, S=S, A=A)

# Throw away n * 10**7 + 1 draws from the global RNG so that each job index
# continues from a different point of the same seeded stream.
# The draws are made one at a time on purpose: keep the statement order and
# the call form unchanged to leave the resulting RNG state untouched.
skipped_draws = n * (10 ** 7) + 1
for _ in range(skipped_draws):
    np.random.randint(S)

# Destination of the pickled results; one file per job index n.
result_dir = './result_switching_deterministic'
result_path = result_dir + '/result_' + str(n)

# Create the output directory up front so a missing directory is detected
# (or fixed) before the long training run instead of after it.
os.makedirs(result_dir, exist_ok=True)

# Build the learner on the prepared environment and run it.
qadv = Qlearning_gen_adv(mdp_env, total_episodes, c1, c2, c3, using_adv_min)
# learn() returns six values; raw_gap and cost are unpacked but not stored
# in the result file below.
best_value, best_Q, value, global_Q, raw_gap, cost = qadv.learn()

# Persist the results together with the learner object itself. The context
# manager guarantees the file is closed even if pickling raises.
with open(result_path, 'wb') as f:
    pickle.dump((best_value, best_Q, value, global_Q, qadv), f)
